import pandas as pd
from textblob import TextBlob
from wordcloud import WordCloud
from collections import Counter
import json
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly import graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import warnings

# Load the cleaned tweets and discard the index column that an earlier
# `to_csv` call wrote out as 'Unnamed: 0'.
df = pd.read_csv('Data/cleaned_data.csv').drop(columns=['Unnamed: 0'])
df.head()
df.info()  # prints the column / dtype summary to stdout

# Parse the timestamp strings so the `.dt` accessor can be used further down.
df['created_at'] = pd.to_datetime(df['created_at'])
df.head()

# Silence every warning category for the remainder of the script.
warnings.filterwarnings("ignore")
# Number of tweets per calendar day, ordered chronologically.
#
# The counts are built straight from the column instead of assigning into a
# slice of `df` (which triggered SettingWithCopyWarning), and both output
# columns are named explicitly: the names produced by
# `value_counts().reset_index()` differ between pandas 1.x and 2.x, so relying
# on the implicit 'index' column is not portable.
temp = (
    df['created_at'].dt.date
    .value_counts()
    .sort_index()
    .rename_axis('date')
    .reset_index(name='tweets')
)

fig = plt.figure(figsize=(20, 10))
ax = sns.barplot(x='date', y='tweets', hue='date', data=temp, dodge=False)
# Keep only tweets that still have cleaned text, and keep one row per distinct
# tweet body. Both operations modify `df` in place and leave gaps in its index.
df.dropna(subset=['clean_text'], inplace=True)
df.drop_duplicates('full_text', inplace=True)

# Print the five most retweeted and the five most favourited tweets.
# Iterating over the rows that are actually present (rather than a fixed
# range(5)) avoids a KeyError when fewer than five tweets remain.
for metric in ('retweet_count', 'favorite_count'):
    fav = (
        df[[metric, 'full_text']]
        .sort_values(metric, ascending=False)
        .head(5)
        .reset_index()
    )
    for rank, (count, text) in enumerate(zip(fav[metric], fav['full_text']), start=1):
        print('{}). {} Counts\n==> {}\n'.format(rank, count, text))
def getSubjectivity(text):
    """Return the subjectivity score TextBlob assigns to `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity
def getPolarity(text):
    """Return the polarity score TextBlob assigns to `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity
def analyseSentiment(score):
    """Map a numeric polarity score to a sentiment label.

    Returns 'Negative' for scores below zero, 'Neutral' for a score equal to
    zero, and 'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'
# Score every tweet with TextBlob and derive a categorical sentiment label
# from the polarity score.
df['Subjectivity'] = df['clean_text'].apply(getSubjectivity)
df['Polarity'] = df['clean_text'].apply(getPolarity)
df['Sentiment'] = df['Polarity'].apply(analyseSentiment)

# Tweets per sentiment label, largest group first. The count is kept in a
# column named 'clean_text' because both charts below read it by that name.
temp = (
    df.groupby('Sentiment')['clean_text']
    .count()
    .reset_index()
    .sort_values(by='clean_text', ascending=False)
)

# Bar chart of the sentiment distribution (axis label typo 'Sentimetns' fixed).
fig = px.bar(
    temp,
    x='Sentiment',
    y='clean_text',
    title='Sentiment Analysis',
    labels={'Sentiment': 'Sentiments', 'clean_text': 'Tweets count'},
)
fig.show()

# The same distribution as a funnel-area chart.
fig = go.Figure(go.Funnelarea(
    text=temp.Sentiment,
    values=temp.clean_text,
    title={"position": "top center", "text": "Funnel-Chart of Sentiment Distribution"},
))
fig.show()
# Histogram of tweet timestamps.
# A new figure is opened without an explicit number: `plt.figure(1, ...)`
# re-selects figure 1 if it is still open (the per-day bar chart created
# earlier in this script), which would draw this histogram on top of it.
# NOTE(review): the axis says 'Hours' but the data is the full timestamp split
# into 100 equal-width bins -- confirm that the bins really correspond to hours.
plt.figure(figsize=(10, 6))
plt.hist(df["created_at"], bins=100)
plt.xlabel('Hours', size=15)
plt.ylabel('No. of Tweets', size=15)
plt.title('No. of Tweets per Hour', size=15)

# Distribution of the polarity scores.
plt.figure(figsize=(10, 6))
sns.distplot(df['Polarity'], bins=30)
plt.title('Sentiment Distribution', size=15)
plt.xlabel('Polarity', size=15)
plt.ylabel('Frequency', size=15)
plt.show()
# The 30 most frequent whitespace-separated tokens across all cleaned tweets.
words = [word for tweet in df.clean_text for word in tweet.split()]

# Passing the column names to the constructor keeps this working when there
# are no words at all; assigning `.columns` afterwards raises on an empty frame.
freq = pd.DataFrame(Counter(words).most_common(30), columns=['word', 'frequency'])

plt.figure(figsize=(15, 10))
sns.barplot(y="word", x="frequency", data=freq)
# Per-sentiment subsets of the tweets.
neg_df = df[df['Sentiment'] == 'Negative']
neu_df = df[df['Sentiment'] == 'Neutral']
pos_df = df[df['Sentiment'] == 'Positive']


def _render_wordcloud(texts, filename):
    """Build a word cloud from `texts`, save it to `filename` and display it.

    `texts` is an iterable of strings. Returns the generated WordCloud, or
    None when there is no text to draw.
    """
    # Join with a space: joining with '' glues the last word of each tweet to
    # the first word of the next one and produces tokens nobody wrote.
    corpus = ' '.join(texts)
    if not corpus.strip():
        print('No text available for {}; skipping word cloud.'.format(filename))
        return None

    cloud = WordCloud(
        width=1000,
        height=700,
        background_color="white",
        random_state=21,
        max_font_size=119,
    ).generate(corpus)
    cloud.to_file(filename)

    plt.figure(num=None, figsize=(25, 8), dpi=180, edgecolor='k')
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    return cloud


# One cloud for the whole corpus, then one per sentiment class.
for subset, filename in (
    (df, 'All_Data.png'),
    (neg_df, 'ngwords_wordcloud.png'),
    (neu_df, 'neuwords_wordcloud.png'),
    (pos_df, 'poswords_wordcloud.png'),
):
    wordCloud = _render_wordcloud(subset['clean_text'], filename)
# Load the COVID case statistics and drop the stale index column.
df_covid = pd.read_csv('Data/covid_cases.csv')
df_covid.drop('Unnamed: 0', axis=1, inplace=True)
df_covid['Date'] = pd.to_datetime(df_covid['Date'])

# Restrict the statistics to the period covered by the tweets.
# `df` has had rows removed in place without its index being reset, so the
# label lookups `df['created_at'][0]` / `[len(df) - 1]` can raise KeyError or
# pick an unrelated row. min()/max() give the true bounds regardless of row
# order or index labels.
# NOTE(review): assumes `created_at` and `Date` are either both timezone-naive
# or both timezone-aware -- confirm against the CSV contents.
start_date = df['created_at'].min()
end_date = df['created_at'].max()
mask = (df_covid['Date'] >= start_date) & (df_covid['Date'] <= end_date)
covid_cases = df_covid.loc[mask]

# Daily totals over all rows. Only the three columns that are plotted are
# summed, so the result does not depend on how the installed pandas version
# treats non-numeric columns in `groupby().sum()`.
temp = (
    covid_cases.groupby('Date')[['Confirmed', 'Recovered', 'Deaths']]
    .sum()
    .reset_index()
)

fig = go.Figure(data=[
    go.Bar(x=temp['Date'], y=temp['Confirmed'], name='Confirmed Cases'),
    go.Bar(x=temp['Date'], y=temp['Recovered'], name='Recovered Cases'),
    go.Bar(x=temp['Date'], y=temp['Deaths'], name='Deaths Cases'),
])
# Draw the three series side by side for each date.
fig.update_layout(barmode='group')
fig.show()
# Per-country totals for the selected period. Only the case columns are
# summed: a blanket `.sum()` would also try to add up the datetime `Date`
# column, which recent pandas versions reject.
# NOTE(review): if the source figures are cumulative per day, summing over
# dates over-counts -- confirm how the CSV is built.
top_n = 5
temp = (
    covid_cases.groupby('Country/Region')[['Confirmed', 'Recovered', 'Deaths']]
    .sum()
    .reset_index()
)
top_confirmed_cases = temp.sort_values(by='Confirmed', ascending=False)
top_recovered_Cases = temp.sort_values(by='Recovered', ascending=False)
top_deaths_cases = temp.sort_values(by='Deaths', ascending=False)

fig = make_subplots(
    rows=1,
    cols=3,
    shared_yaxes=True,
    horizontal_spacing=0.01,
    subplot_titles=(
        'Top {} Countries in Confirmed'.format(top_n),
        'Top {} Countries in Recovered'.format(top_n),
        'Top {} Countries in Deaths'.format(top_n),
    ),
)

# One subplot per metric. The number of bars now matches the titles (the
# original sliced six rows under a "Top 5" heading) and every bar gets its own
# colour value (the original passed three values for six bars); the colour
# encodes the rank within the subplot.
rankings = (
    (top_confirmed_cases, 'Confirmed'),
    (top_recovered_Cases, 'Recovered'),
    (top_deaths_cases, 'Deaths'),
)
for column, (ranking, metric) in enumerate(rankings, start=1):
    top = ranking.head(top_n)
    fig.add_trace(
        go.Bar(
            x=top['Country/Region'],
            y=top[metric],
            marker=dict(color=list(range(len(top))), coloraxis="coloraxis"),
        ),
        1, column,
    )

fig.update_layout(coloraxis=dict(colorscale='Bluered_r'), showlegend=False)
fig.show()
# Collect every hashtag. Each cell of `df['hashtags']` is parsed as the text
# form of a list, e.g. "['covid', 'lockdown']".
hashtags = []
for raw in df['hashtags']:
    if not isinstance(raw, str):
        # Missing values are read as float NaN and have no `.replace`.
        continue
    tags = raw.replace('\'', "").strip('][').split(', ')
    # An empty list ("[]") parses to [''] and is skipped here.
    if '' not in tags:
        hashtags.extend(tags)

# Frequency table with explicit column names: the names produced by
# `value_counts().reset_index()` differ between pandas 1.x and 2.x.
temp = (
    pd.Series(hashtags, dtype='object')
    .value_counts()
    .rename_axis('Hashtags')
    .reset_index(name='count')
)

# Ten most used hashtags.
fig = plt.figure(figsize=(20, 10))
ax = sns.barplot(x='Hashtags', y='count', hue='Hashtags', data=temp[:10], dodge=False)
# Persist the scored tweets and the date-filtered COVID statistics.
# The directory is spelled 'Data/' to match the paths the inputs were read
# from; the lower-case 'data/' only resolves to the same folder on
# case-insensitive filesystems.
df.to_csv('Data/analysed_data.csv')
covid_cases.to_csv('Data/covid_stats.csv')